This notebook documents experiments with a bottom-up strategy for determining notebook similarity. It is based on the notion that it is easier to aggregate small pieces than to break down a 'black box.'
The biggest challenge is working with the AST structure. Because it is a tree, we need to merge leaves into their parents, working our way up from the bottom.
There are two main goals:
In [1]:
    
# Necessary imports 
import os
import time
from nbminer.notebook_miner import NotebookMiner
from nbminer.cells.cells import Cell
from nbminer.features.ast_features import ASTFeatures
from nbminer.stats.summary import Summary
from nbminer.stats.multiple_summary import MultipleSummary
from nbminer.features.featurize.ast_graph.ast_graph import *
    
In [2]:
    
# Root directory that holds one sub-directory of notebooks per person.
testbed_root = '../testbed/Final'

# os.listdir() returns entries in arbitrary order. Later cells address
# a.nb_features by position (e.g. [2], [3], [25], [39]), so sort both listings
# to keep those positions stable across runs and machines.
# NOTE(review): assumes ASTFeatures keeps the order of the list it is given — confirm.
people = sorted(os.listdir(testbed_root))
notebooks = []
for person in people:
    person = os.path.join(testbed_root, person)
    # Skip stray files that sit next to the per-person directories.
    if os.path.isdir(person):
        direc = sorted(os.listdir(person))
        notebooks.extend(os.path.join(person, filename) for filename in direc if filename.endswith('.ipynb'))

# Wrap every notebook path in a NotebookMiner and featurize the whole collection.
notebook_objs = [NotebookMiner(file) for file in notebooks]
a = ASTFeatures(notebook_objs)
    
In [3]:
    
# Overwrite each entry of a.nb_features with the result of its own
# get_new_notebook() call. The list is rewritten in place, so running this
# cell a second time calls get_new_notebook() on the already-replaced entries.
for idx in range(len(a.nb_features)):
    a.nb_features[idx] = a.nb_features[idx].get_new_notebook()
    
In [ ]:
    
    
In [4]:
    
graphs = []
for nb in a.nb_features:
    for cell in nb.get_all_cells():
        graphs.append(cell.get_feature('graph'))
agr = ASTGraphReducer(graphs)
num_nodes = []
for g in agr.graphs:
    num_nodes.append(g.graph_nodes())
print ('Total number of graphs:',agr.number_graphs())
print ('Total number of graphs with one node:',agr.number_single())
print ('Total number of nodes:',agr.count_nodes())
print (agr.count_nodes())
%matplotlib inline
import matplotlib.pyplot as plt
plt.hist(num_nodes, bins=30)
    
    
    Out[4]:
    
In [5]:
    
# Node count before any build_relations() pass.
print(agr.count_nodes())

# Keep calling build_relations() until two consecutive count_nodes() readings
# agree. The two counters start at different sentinel values so the loop body
# runs at least once. Each reading is taken BEFORE the build_relations() call
# of the same iteration, so one more pass runs after the first unchanged reading.
cur_count, new_count = 0, 1
while new_count != cur_count:
    cur_count, new_count = new_count, agr.count_nodes()
    agr.build_relations()

# Last reading taken inside the loop (i.e. before the final pass).
print(new_count)
    
    
In [6]:
    
num_nodes = []
for g in agr.graphs:
    num_nodes.append(g.graph_nodes())
print ('Total number of graphs:',agr.number_graphs())
print ('Total number of graphs with one node:',agr.number_single())
print ('Total number of nodes:',agr.count_nodes())
print (agr.count_nodes())
%matplotlib inline
import matplotlib.pyplot as plt
plt.hist(num_nodes, bins=30)
    
    
    Out[6]:
    
In [7]:
    
# Similarity between nb 0 and all other notebooks:
# Pull the [1][1] component out of every entry returned for notebook 0 and
# show the values in ascending order.
scores_vs_first = [entry[1][1] for entry in a.notebook_jaccard_similarity(0)]
scores_vs_first.sort()
print(scores_vs_first)
    
    
In [8]:
    
# Maximum similarity
# Scan the similarity entries of every notebook and remember the largest
# [1][1] score together with the pair (queried index, entry[0]) it came from.
all_sims = []
max_sim, max_val = 0, None
nb_total = len(a.nb_features)
for i in range(nb_total):
    for similarity in a.notebook_jaccard_similarity(i):
        score = similarity[1][1]
        # Strict comparison: on ties the first pair encountered is kept.
        if score > max_sim:
            max_sim, max_val = score, (i, similarity[0])
(max_sim, max_val)
    
    Out[8]:
In [9]:
    
# Show the filename of the notebook stored at position 2 of a.nb_features.
# NOTE(review): the index is hard-coded — presumably one half of the pair
# reported by the maximum-similarity cell; confirm against that cell's output.
a.nb_features[2].notebook.filename
    
    Out[9]:
In [10]:
    
# Show the filename of the notebook stored at position 3 of a.nb_features.
# NOTE(review): the index is hard-coded — presumably the other half of the pair
# reported by the maximum-similarity cell; confirm against that cell's output.
a.nb_features[3].notebook.filename
    
    Out[10]:
Now we're interested in what happened with this bottom-up approach. What does the final result look like? We can print out each graph to get a sense of what has happened. Then we can look at some actual code, what it looks like in graph format, and what the black boxes it holds actually mean.
In [21]:
    
# Print the node list of each cell graph of the notebook at position 25.
nb_25_cells = a.nb_features[25].get_all_cells()
for cell in nb_25_cells:
    cell_graph = cell.get_feature('graph')
    print(cell_graph.get_nodes())
    
    
In [22]:
    
# Print the node list of each cell graph of the notebook at position 39.
nb_39_cells = a.nb_features[39].get_all_cells()
for cell in nb_39_cells:
    cell_graph = cell.get_feature('graph')
    print(cell_graph.get_nodes())
    
    
In [13]:
    
# Flatten the cells of every notebook into one list, keeping notebook order.
cells = []
for nb in a.nb_features:
    cells.extend(nb.get_all_cells())

# Partition the flat list into runs of consecutive cells that share the same
# 'original_code' feature; each run becomes one entry of `groups`.
#
# Two defects of the earlier version are fixed here:
#   * the cell that opened a new run was thrown away (the run restarted empty),
#     so every group was missing its first member;
#   * the run still being collected when the loop ended was never stored.
# As a consequence, runs of a single cell now appear in `groups` too, which
# shifts the positions of the other groups compared to the earlier version.
groups = []
cur_code = ''
cur_group = []
for cell in cells:
    code = cell.get_feature('original_code')
    if cur_group and code == cur_code:
        cur_group.append(cell)
    else:
        if cur_group:
            groups.append(cur_group)
        # Start the next run with the cell that triggered the change.
        cur_group = [cell]
        cur_code = code
# Store the trailing run, which no later cell can close.
if cur_group:
    groups.append(cur_group)
    
In [14]:
    
# Index into `groups` of the group to inspect.
group = 6
banner = '*'*50

# Section 1: node lists of every cell graph in the chosen group.
print(banner)
print('Black Boxes')
members = groups[group]
for cell in members:
    print(cell.get_feature('graph').get_nodes())

# Section 2: the original code, read from the first cell of the group.
print(banner)
print('Code')
print(members[0].get_feature('original_code'))

# Section 3: for graphs consisting of exactly one node whose name starts with
# 'black', print what agr.get_trace() returns for that node.
print(banner)
print('Black Box meaning')
for cell in members:
    n = cell.get_feature('graph').get_nodes()
    if len(n) != 1 or n[0][:5] != 'black':
        continue
    print(agr.get_trace(n[0]))
    
    
In [31]:
    
# Print what the reducer's get_trace() returns for one specific node name.
# NOTE(review): 'black_box1288' is hard-coded — presumably copied from the
# output of an earlier run; it may not exist (or may mean something else)
# after the notebook is re-executed. Confirm before relying on it.
print (agr.get_trace('black_box1288'))
    
    
In [24]:
    
# List every key of agr.names that contains the substring 'Call'.
for key in agr.names.keys():
    if 'Call' not in key:
        continue
    print(key)
    
    
In [17]:
    
# One inner list per notebook, holding the 'graph' feature of each of its cells.
graph_sets = [
    [cell.get_feature('graph') for cell in nb.get_all_cells()]
    for nb in a.nb_features
]
    
In [18]:
    
# Build a combiner over graph_sets (one list of cell graphs per notebook).
# NOTE(review): ASTGraphCombiner is not imported by name in this notebook —
# presumably it arrives through the star import of
# nbminer.features.featurize.ast_graph.ast_graph; confirm.
agc = ASTGraphCombiner(graph_sets)
    
In [19]:
    
# Report the graph count on either side of reduce_graphs(), then the
# distinct-node count reported by the combiner afterwards.
graphs_before = agc.count_graphs()
print('before', graphs_before)

agc.reduce_graphs()

graphs_after = agc.count_graphs()
print('after', graphs_after)
print('total_distinct', agc.count_distinct_nodes())
    
    
In [20]:
    
# Print the node list of every graph in the combiner's first graph set.
first_graph_set = agc.graph_sets[0]
for graph in first_graph_set:
    nodes = graph.get_nodes()
    print(nodes)
    
    
In [ ]: